#1. Load the libraies
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(methods)
library(edgeR)
## Warning: package 'edgeR' was built under R version 4.3.2
## Loading required package: limma
## Warning: package 'limma' was built under R version 4.3.1
library(Seurat)
## Warning: package 'Seurat' was built under R version 4.3.3
## Loading required package: SeuratObject
## Warning: package 'SeuratObject' was built under R version 4.3.2
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.3.2
##
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
##
## intersect, t
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rafalib)
library(devtools)
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.3.3
##
## Attaching package: 'devtools'
## The following object is masked from 'package:rafalib':
##
## install_bioc
#2. Import data # Change the directories for your data
GSE146771_CRC_Leukocyte_10x_Metadata<- read.csv("/Users/rxk519/Desktop/Projects/CRC_Project/CRC_datasets_GSE146771/GSE146771_CRC.Leukocyte.10x.Metadata.txt", sep = "\t")
vec.cell.type <- GSE146771_CRC_Leukocyte_10x_Metadata$CellName
names(vec.cell.type) <- GSE146771_CRC_Leukocyte_10x_Metadata$Global_Cluster
#counts
#GSE146771_CRC_Leukocyte_10x_TPM<- read.table("/Users/rxk519/Desktop/Projects/CRC_Project/CRC_datasets_GSE146771/GSE146771_CRC.Leukocyte.10x.TPM.txt", fill = TRUE, header = TRUE, sep = "")
#saveRDS(GSE146771_CRC_Leukocyte_10x_TPM, file = "/Users/rxk519/Desktop/Projects/CRC_Project/CRC_datasets_GSE146771/data/GSE146771_CRC_Leukocyte_10x_TPM.rds")
GSE146771_CRC_Leukocyte_10x_TPM<-readRDS("/Users/rxk519/Desktop/Projects/CRC_Project/CRC_datasets_GSE146771/data/GSE146771_CRC_Leukocyte_10x_TPM.rds")
#3.create a seurat project
#create a seurat object
dim(GSE146771_CRC_Leukocyte_10x_TPM) #dim of data
## [1] 13538 43817
pbmc <- CreateSeuratObject(counts = GSE146771_CRC_Leukocyte_10x_TPM, project = "CRC", min.cells = 3, min.features = 200)
## Warning: Data is of class data.frame. Coercing to dgCMatrix.
pbmc
## An object of class Seurat
## 13538 features across 43817 samples within 1 assay
## Active assay: RNA (13538 features, 0 variable features)
## 1 layer present: counts
#4.add variables to the seurat object #add the variable to the metadata(Sub_ClusterID, Global_Cluster, CellName) #CellName
cell.type <- GSE146771_CRC_Leukocyte_10x_Metadata$Global_Cluster
names(cell.type) <- GSE146771_CRC_Leukocyte_10x_Metadata$CellName
pbmc <- AddMetaData(object = pbmc, metadata = cell.type, col.name = "cell_type")
#Sub_ClusterID
tumor.name <- GSE146771_CRC_Leukocyte_10x_Metadata$Sub_ClusterID
names(tumor.name) <- GSE146771_CRC_Leukocyte_10x_Metadata$CellName
pbmc <- AddMetaData(object = pbmc, metadata = tumor.name, col.name = "tumor_name")
#5. Preprocessing Steps #5.1 calculate mitocondria percentage
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
#5.2FeatureScatter plot #FeatureScatter is typically used to visualize gene-gene relationships, but can be used for anything calculated by the object
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
par(mfrow = c(1, 2))
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
#5.3filter out cells that have unique gene counts (nFeature_RNA) over 2,500 or less than
# 200 Note that > and < are used to define a'gate'.
#-Inf and Inf should be used if you don't want a lower or upper threshold.
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
pbmc <- SetIdent(pbmc, value = pbmc@meta.data$cell_type)
#5.4 Normalization step
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
## Normalizing layer: counts
#5.5 Detection of variable genes across the single cells
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts
#5.6 Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)
#5.7 plot variable features with and without labels
par(mfrow = c(1, 2))
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2
#6. scale Data
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
#6.2 Perform linear dimensional reduction (PCA)
#perform PCA on the scaled data
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1
## Positive: IL32, CD3D, CD2, RPL3, CD3E, CD3G, LCK, MALAT1, CD69, CCL5
## CD7, GZMA, IL2RG, EVL, RPSA, KLRB1, CST7, STK17A, ITM2A, CLEC2D
## IL7R, CXCR4, RORA, GZMM, LDHB, ISG20, CD247, NKG7, JUN, LTB
## Negative: CST3, LYZ, AIF1, SERPINA1, LST1, FCN1, CSTA, FCER1G, TYROBP, MNDA
## CD14, S100A9, S100A8, VCAN, CFD, FPR1, RETN, IFITM3, MS4A6A, CYBB
## PSAP, CLEC12A, SPI1, S100A12, MS4A7, GRN, CD68, CTSS, TYMP, FGL2
## PC_ 2
## Positive: CD79A, DERL3, TNFRSF17, MZB1, JCHAIN, TXNDC5, PRDX4, HERPUD1, FOSB, EAF2
## SPAG4, UBE2J1, SEC11C, SDC1, RRBP1, BTG2, FKBP2, SSR3, LMAN1, JUN
## TNFRSF13B, FKBP11, MEF2C, ITM2C, HSP90B1, NR4A1, SPCS3, XBP1, CD38, FAM46C
## Negative: TMSB4X, S100A4, GIMAP7, NKG7, HCST, IL32, GZMH, SH3BGRL3, ACTB, CST7
## GZMA, CCL5, GIMAP4, PTPRC, PFN1, ANXA1, CD2, FYB, FGFBP2, KLRD1
## CD3D, IFITM2, CORO1A, GZMM, S100A6, TMSB10, PRF1, CD52, CD3G, MYL12A
## PC_ 3
## Positive: TNFAIP3, CREM, CXCL2, NR4A2, RGS1, CXCL8, OLR1, C15ORF48, DUSP2, SAMSN1
## CCL20, PHLDA1, CXCL3, ZNF331, IL1B, ZFP36, G0S2, AREG, CCL3L3, SPP1
## DNAJB1, APOC1, RNASE1, VEGFA, DNAJA1, IER3, EREG, APOE, FOSB, HBEGF
## Negative: CD79A, MEF2C, CD79B, TNFRSF17, MS4A1, MZB1, DERL3, BANK1, VPREB3, TXNIP
## EAF2, TNFRSF13B, MALAT1, CD74, RALGPS2, TXNDC5, RPL3, PLAC8, KIAA0125, LINC00926
## PRDX4, SPIB, SEC11C, FCRLA, UBE2J1, RPS5, TCL1A, FTL, TNFRSF13C, TMSB10
## PC_ 4
## Positive: FKBP11, DERL3, TNFRSF17, CD63, NKG7, RRBP1, SDF2L1, GZMH, PRDX4, TXNDC5
## XBP1, GZMA, MZB1, LGALS1, VIMP, GZMB, KLRD1, MANF, FKBP2, CTSW
## CCL5, SEC11C, CCL4, FGFBP2, CST7, SPAG4, SDC1, ITM2C, GSTP1, MYDGF
## Negative: MS4A1, LTB, BANK1, VPREB3, CD22, CXCR4, TCL1A, CCR7, SPIB, HLA-DQB1
## LINC00926, CD24, CD37, GPR183, CD83, HLA-DMB, TNFRSF13C, SWAP70, HLA-DQA1, LAPTM5
## FCER2, CD72, CD52, BTG1, CD79B, SMIM14, HLA-DRA, TLR10, PABPC1, REL
## PC_ 5
## Positive: FOS, IL7R, S100A12, DUSP1, FYB, ICOS, VIM, JAML, FCN1, SARAF
## ITM2B, VCAN, CLEC12A, CD40LG, SRGN, TNFRSF4, CYP1B1, PBXIP1, CITED2, TBC1D4
## TNFRSF18, JUNB, CD28, RETN, RGCC, TRAT1, TNFRSF25, CFP, SPOCK2, CD27
## Negative: HLA-DPB1, HLA-DQA1, HLA-DPA1, HLA-DQB1, MS4A1, HLA-DQA2, GZMH, FGFBP2, HLA-DRB1, CCL4
## C15ORF48, HLA-DMB, OLR1, APOC1, NKG7, BANK1, APOE, SPP1, GPNMB, C1QC
## CXCL2, HLA-DRA, CD83, KLRD1, KLRF1, RNASE1, CXCL3, GZMB, GNLY, VPREB3
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: IL32, CD3D, CD2, RPL3, CD3E
## Negative: CST3, LYZ, AIF1, SERPINA1, LST1
## PC_ 2
## Positive: CD79A, DERL3, TNFRSF17, MZB1, JCHAIN
## Negative: TMSB4X, S100A4, GIMAP7, NKG7, HCST
## PC_ 3
## Positive: TNFAIP3, CREM, CXCL2, NR4A2, RGS1
## Negative: CD79A, MEF2C, CD79B, TNFRSF17, MS4A1
## PC_ 4
## Positive: FKBP11, DERL3, TNFRSF17, CD63, NKG7
## Negative: MS4A1, LTB, BANK1, VPREB3, CD22
## PC_ 5
## Positive: FOS, IL7R, S100A12, DUSP1, FYB
## Negative: HLA-DPB1, HLA-DQA1, HLA-DPA1, HLA-DQB1, MS4A1
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
#6.3 DimHeatmap for Genes by PCs
#PC1 plot
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
#6.4 DimHeatmap for Genes by PCs (15) #PC1-PC15 plots
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
#DimPlot(pbmc, reduction = "pca")
#compiuting the nearest neighbor graph
pbmc <- FindNeighbors(pbmc, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
#computing the clusters
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 43049
## Number of edges: 1369226
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9331
## Number of communities: 17
## Elapsed time: 10 seconds
#RunMap Non-linear dimension reduction(UMAP/Tsne)
#head(Idents(pbmc), 5)
#-----------------------------------
pbmc <- RunUMAP(pbmc, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 10:13:31 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:13:31 Read 43049 rows and found 10 numeric columns
## 10:13:31 Using Annoy for neighbor search, n_neighbors = 30
## 10:13:31 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:13:34 Writing NN index file to temp file /var/folders/vm/569pw0m92jl40jfnhyq4hrwnfhms6v/T//RtmpNiYAMR/filec3b76eb99b4b
## 10:13:34 Searching Annoy index using 1 thread, search_k = 3000
## 10:13:45 Annoy recall = 100%
## 10:13:46 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:13:47 Initializing from normalized Laplacian + noise (using RSpectra)
## 10:13:50 Commencing optimization for 200 epochs, with 1783242 positive edges
## 10:14:10 Optimization finished
#Dimplot without selecting Ident( variable)
DimPlot(pbmc, reduction = "umap")
#Dimplot selecting Ident as cell_type
#using the cell_type to create a pca plot
pbmc <- SetIdent(pbmc, value = pbmc@meta.data$cell_type)
DimPlot(pbmc, label = T , repel = T, label.size = 3) + NoLegend()
#Finding differentially expressed features (cluster biomarkers)
#Find differentially expressed features between Myeloid cell and all other cells, only
# search for positive markers
Myeloid.markers <- FindMarkers(object = pbmc, ident.1 = "Myeloid cell", min.pct = 0.25, test.use = "roc", only.pos = TRUE)
head(Myeloid.markers)
## myAUC avg_diff power avg_log2FC pct.1 pct.2
## CST3 0.986 2.316599 0.972 4.496856 0.981 0.099
## TYROBP 0.973 1.926025 0.946 3.346939 0.993 0.161
## FCER1G 0.966 2.003203 0.932 3.683680 0.979 0.118
## LYZ 0.961 2.442739 0.922 4.744123 0.938 0.092
## AIF1 0.961 2.247990 0.922 4.541384 0.938 0.077
## LST1 0.952 2.144867 0.904 4.221021 0.929 0.091
#differential expression between two specific groups of cells, specify the ident.1 and ident.2 parameters
#DE genes between to CD4 cell & CD8 T cell
CD.markers <- FindMarkers(object = pbmc, ident.1 = "CD4 T cell", ident.2 = "CD8 T cell", min.pct = 0.25)
## For a (much!) faster implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the presto package
## --------------------------------------------
## install.packages('devtools')
## devtools::install_github('immunogenomics/presto')
## --------------------------------------------
## After installation of presto, Seurat will automatically use the more
## efficient implementation (no further action necessary).
## This message will be shown once per session
FeaturePlot(object = pbmc,
features = c("TREM1", "TREM2", "LILRB4A", "SIGLEC15", "REL", "PDL1", "PDCD1LG2","CD274", "SIGLEC10", "MERTK", "SIRPA", "PDCD1","NT5E","ENTPD1","CD276", "CD83", "TNFR2" ))
## Warning: The following requested variables were not found: LILRB4A, PDL1, TNFR2
pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
## Calculating cluster CD4 T cell
## Calculating cluster ILC
## Calculating cluster CD8 T cell
## Calculating cluster B cell
## Calculating cluster Myeloid cell
VlnPlot(object = pbmc, features = c("TREM1", "TREM2"))
# view results
#Cell marker genes
# find all markers of cluster 1 using default parameters
markers_genes <- FindAllMarkers(pbmc, log2FC.threshold = 0.2, test.use = "wilcox",
min.pct = 0.1, min.diff.pct = 0.2, only.pos = TRUE, max.cells.per.ident = 50,
assay = "RNA")
## Calculating cluster CD4 T cell
## Calculating cluster ILC
## Calculating cluster CD8 T cell
## Calculating cluster B cell
## Calculating cluster Myeloid cell
#select the top 25 up regulated genes for plotting
markers_genes %>%
group_by(cluster) %>%
top_n(-25, p_val_adj) -> top25
top25
## # A tibble: 145 × 7
## # Groups: cluster [5]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 7.41e-11 1.63 0.929 0.302 0.00000100 CD4 T cell CD3D
## 2 8.32e-10 1.59 0.721 0.222 0.0000113 CD4 T cell CD3E
## 3 2.16e- 9 1.36 0.955 0.392 0.0000292 CD4 T cell IL32
## 4 2.51e- 9 1.63 0.703 0.209 0.0000340 CD4 T cell CD3G
## 5 5.94e- 9 1.68 0.819 0.258 0.0000805 CD4 T cell CD2
## 6 2.30e- 8 1.75 0.843 0.321 0.000312 CD4 T cell LTB
## 7 2.31e- 7 2.27 0.365 0.089 0.00313 CD4 T cell PBXIP1
## 8 4.40e- 7 2.86 0.679 0.106 0.00596 CD4 T cell IL7R
## 9 1.80e- 6 1.29 0.797 0.432 0.0243 CD4 T cell LDHB
## 10 1.89e- 6 0.907 0.59 0.324 0.0255 CD4 T cell CD69
## # ℹ 135 more rows
#We can now select the top 25 up regulated genes for plotting
mypar(2, 5, mar = c(4, 6, 3, 1))
for (i in unique(top25$cluster)) {
barplot(sort(setNames(top25$avg_log2FC, top25$gene)[top25$cluster == i], F),
horiz = T, las = 1, main = paste0(i, " vs. rest"), border = "white", yaxs = "i")
abline(v = c(0, 0.25), lty = c(1, 2))
}
markers_genes %>%
group_by(cluster) %>%
top_n(-5, p_val_adj) -> top5
# create a scale.data slot for the selected genes
pbmc <- ScaleData(pbmc, features = as.character(unique(top5$gene)), assay = "RNA")
## Centering and scaling data matrix
## Warning: Different features in new layer data than already exists for
## scale.data
DoHeatmap(pbmc, features = as.character(unique(top5$gene)), group.by = "cell_type",
assay = "RNA")
DotPlot(pbmc, features = rev(as.character(unique(top5$gene))), group.by = "cell_type",
assay = "RNA") + coord_flip()
#find markers for every cluster compared to all remaining cells, report
All.markers <- FindAllMarkers(pbmc, only.pos = T, min.pct = 0.5, logfc.threshold = 0.5)
## Calculating cluster CD4 T cell
## Calculating cluster ILC
## Calculating cluster CD8 T cell
## Calculating cluster B cell
## Calculating cluster Myeloid cell
dim(All.markers)
## [1] 368 7
table(All.markers$cluster)
##
## CD4 T cell ILC CD8 T cell B cell Myeloid cell
## 37 45 43 83 160
All.markers %>% group_by(cluster) %>% top_n(2, avg_log2FC)
## # A tibble: 10 × 7
## # Groups: cluster [5]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 0 2.86 0.679 0.106 0 CD4 T cell IL7R
## 2 0 1.75 0.843 0.321 0 CD4 T cell LTB
## 3 0 3.68 0.888 0.13 0 ILC GNLY
## 4 0 4.80 0.661 0.032 0 ILC KLRF1
## 5 0 3.13 0.63 0.075 0 CD8 T cell KLRD1
## 6 0 2.92 0.597 0.109 0 CD8 T cell GZMH
## 7 0 5.57 0.933 0.036 0 B cell CD79A
## 8 0 5.65 0.525 0.018 0 B cell DERL3
## 9 0 7.43 0.634 0.008 0 Myeloid cell RETN
## 10 0 7.57 0.61 0.005 0 Myeloid cell FPR1
#Identify the markers in the clusters
FeaturePlot(pbmc, features = c("TREM1", "TREM2", "LILRB4A", "SIGLEC15", "REL", "PDL1", "PDCD1LG2","CD274", "SIGLEC10", "MERTK", "SIRPA", "PDCD1","NT5E","ENTPD1","CD276", "CD83", "TNFR2" ))
## Warning: The following requested variables were not found: LILRB4A, PDL1, TNFR2
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: